In [2]:
import numpy as np
import pandas as pd
In [3]:
# Configure how pandas renders frames in this notebook.
# Fully-qualified option keys are used on purpose: set_option treats the key
# as a pattern that must match exactly one option, and short forms such as
# 'html' or 'max_rows' can match several options in newer pandas releases.
pd.set_option('display.notebook_repr_html', False)  # plain-text repr instead of HTML tables
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 10)
Data downloaded from: http://www.dshs.state.tx.us/thcic/hospitals/Inpatientpudf.shtm
Field widths and names from: http://www.dshs.state.tx.us/THCIC/Hospitals/UserManual2007.pdf
In [4]:
# Character widths of the 167 fixed-width fields, in file order.
# The first 30 fields have irregular widths and are spelled out; the rest
# come in uniform runs and are built by repetition.
leading_widths = [
    6, 6, 55, 1, 1, 1, 1, 1, 1, 1,
    1, 5, 2, 1, 1, 1, 2, 5, 2, 3,
    2, 1, 4, 2, 2, 1, 1, 2, 2, 3,
]
widths = (
    leading_widths
    + [12] * 36        # fields 31-66: 12 characters each
    + [6] * 26         # fields 67-92: 6 characters each
    + [7, 4, 5] * 25   # fields 93-167: repeating 7/4/5-character triples
)
# Displayed as the cell output; should equal the number of column names.
len(widths)
Out[4]:
In [5]:
# Column labels for the 167 fixed-width fields. Fields that are not given a
# label below keep their 1-based field number as the column name.
# list(...) is required: on Python 3 a bare range object is immutable and
# rejects item assignment.
names = list(range(1, 168))

# 1-based field number -> column label.
labelled_fields = {
    1: 'Discharge',
    2: 'THCIC_ID',
    3: 'Provider_Name',
    14: 'Sex',
    17: 'Pat_State',
    18: 'Pat_ZIP',
    22: 'Admit_Weekday',  # corrected spelling (was 'Admit_Weedkay')
    23: 'Length_of_Stay',
    24: 'Pat_Age',
    25: 'Pat_Status',
    26: 'Race',
    61: 'Total_Charges',
    67: 'Admitting_Diagnosis',
    68: 'Princ_Diag_Code',
}
for field_no, label in labelled_fields.items():
    names[field_no - 1] = label

# Fields 69-92 are labelled Oth_Diag_Code_1 .. Oth_Diag_Code_24.
for field_no in range(69, 92 + 1):
    names[field_no - 1] = 'Oth_Diag_Code_' + str(field_no - 68)

# Field 250 is outside the 167 fields listed here, so it stays disabled.
#names[250-1]='Risk_Mortality'
In [6]:
# Parse the fixed-width text file into a DataFrame. header=None means no row
# of the file is used as a header; column labels come from `names` and the
# field boundaries from `widths`.
pudf_txt_path = '/var/datasets/dshs/CD2007Q1/PUDF_base1q2007.txt'
data = pd.read_fwf(
    pudf_txt_path,
    header=None,
    names=names,
    widths=widths,
)
In [7]:
# Print a summary of the parsed frame as a sanity check on the load.
data.info()
In [22]:
# Save the parsed frame to an HDF5 store under the key 'data'; mode='w'
# overwrites any existing file at that path.
# `key` and `mode` are passed by keyword because newer pandas releases
# deprecate positional arguments after the path.
data.to_hdf(
    '/var/datasets/dshs/CD2007Q1/PUDF_base1q2007.h5',
    key='data',
    mode='w',
)
In [ ]: